#!/usr/bin/env python3
#
#  Process the top publishers for each journal subject 
#  
#  Copyright 2025 Eko Didik Widianto <didik@live.undip.ac.id>
#  
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 3 of the License, or
#  (at your option) any later version.
#  
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#  
#  You should have received a copy of the GNU General Public License
#  along with this program. If not, see <https://www.gnu.org/licenses/>.
#
import sys
from pathlib import Path    # Path info
import re

import pandas as pd     # Pandas

import numpy as np

# Directory layout: every location below is derived from this script's own folder
path_abs = Path(__file__).parent.absolute()
sys.path.append(str(path_abs) + "/..")  # expose the parent directory to the import system
in_path = str(path_abs) + "/data_master/"    # master JSON inputs
report_path = str(path_abs) + '/reporting/'  # profile/report outputs
result_path = str(path_abs) + '/results/'    # tabular result outputs

# Project helpers (kept below the sys.path tweak, which this import presumably relies on)
from util.utils import save_file, open_json_file, save_aprofile

# Data sources: [display name, base name of the master journal JSON file under in_path]
data_file = [
  ['Garuda', 'gjournal'],
  ['SINTA', 'sjournal'],
]

def _profile_subject(subj_rows, subj, jour_src, nsrc):
  """Build the publisher profile of one subject area.

  Args:
    subj_rows: rows of the grouped table for this subject, one row per
      publisher, already sorted by 'ntitle' in descending order. Must hold
      the columns 'institution' and 'ntitle' and at least one row.
    subj: the subject area the rows belong to.
    jour_src: base name of the journal data source (stored under 'src').
    nsrc: total number of journal-subject rows of the source; denominator
      of the subject share 'nsubjp'.

  Returns:
    dict with counts, percentages and preformatted "count (percent)" strings
    for the subject, its top publisher, its runner-up and the remaining
    publishers, plus mean / median / IQR of titles per publisher.
  """
  nsubj = subj_rows['ntitle'].sum()
  npub = len(subj_rows.index)

  profile = {'src': jour_src, 'nsrc': nsrc, 'subj': subj, 'nsubj': nsubj}
  profile['nsubjp'] = round(100 * nsubj / nsrc, 2)
  profile['nsubjs'] = f"{nsubj:,d} ({profile['nsubjp']:.2f})"

  # Top publisher: first row, because the rows are sorted by descending count
  top = subj_rows.iloc[0]
  profile['publisher'] = top['institution']
  profile['npsubj'] = top['ntitle']
  profile['npsubjp'] = round(100 * profile['npsubj'] / nsubj, 2)
  profile['npsubjs'] = f"{profile['npsubj']:,d} ({profile['npsubjp']:.2f})"

  # Runner-up: a subject may have a single publisher only, in which case
  # iloc[1] would raise IndexError; report an empty runner-up instead.
  if npub > 1:
    second = subj_rows.iloc[1]
    profile['publisher2'] = second['institution']
    profile['npsubj2'] = second['ntitle']
  else:
    profile['publisher2'] = ''
    profile['npsubj2'] = 0
  profile['npsubj2p'] = round(100 * profile['npsubj2'] / nsubj, 2)
  profile['npsubj2s'] = f"{profile['npsubj2']:,d} ({profile['npsubj2p']:.2f})"

  # Everything that is not published by the top publisher
  profile['npother'] = nsubj - profile['npsubj']
  profile['npotherp'] = round(100 * profile['npother'] / nsubj, 2)
  profile['npothers'] = f"{profile['npother']:,d} ({profile['npotherp']:.2f})"
  profile['nothers'] = npub - 1

  # Distribution of titles per publisher within the subject
  profile['npub'] = npub
  profile['npub_avg'] = round(subj_rows['ntitle'].mean(), 2)
  q75, q50, q25 = np.percentile(subj_rows['ntitle'], [75, 50, 25])
  profile['npub_med'] = q50
  profile['npub_iqr'] = q75 - q25

  return profile


def _report_rows(pprofile):
  """Return the two report rows of a subject: top publisher, then all others."""
  top_row = {
    'src': pprofile['src'],
    'subj': pprofile['subj'],
    'nsubjs': pprofile['nsubjs'],
    'publisher': pprofile['publisher'],
    'npsubjs': pprofile['npsubjs'],
  }
  others_row = {
    'src': pprofile['src'],
    'subj': pprofile['subj'],
    'nsubjs': '',  # subject total is printed on the first row only
    'publisher': f"Others (n={pprofile['nothers']})",
    'npsubjs': pprofile['npothers'],
  }
  return [top_row, others_row]


def main():
  """Rank publishers per journal subject for every source in data_file.

  For each source the master journal file is loaded, restricted to rows whose
  'is_active' value is truthy and whose 'subj_areas' is set, and expanded to
  one row per (journal, subject). Journals are then counted per
  (subject, publisher). The grouped table is handed to save_file and three
  lists of profile records (per source, per subject, and a two-row-per-subject
  report) are handed to save_aprofile.

  NOTE(review): open_json_file is assumed to return a pandas DataFrame with a
  boolean 'is_active' column -- confirm against util.utils.
  """
  print("Top publishers")
  print("================")

  jprofiles = []; pprofiles = []; rprofiles = []

  cols = ['id', 'subj_areas', 'title', 'institution', 'pissn', 'eissn']

  for _name, jour_src in data_file:
    jdf = open_json_file(in_path + jour_src + '.json')
    jdf = jdf[jdf['is_active']][cols]     # Keep active journals only
    jdf = jdf[jdf['subj_areas'].notna()]  # Keep journals that have subject(s)

    # One row per (journal, subject), ordered by subject
    jdf_ex = jdf.explode(['subj_areas'], ignore_index=True)
    jdf_ex = jdf_ex.sort_values(by=['subj_areas']).reset_index(drop=True)
    jdf_ex = jdf_ex.fillna('')

    # Human-readable journal label, built column-wise instead of row by row
    jdf_ex['title_issn'] = (
      jdf_ex['title'] + '; eISSN: ' + jdf_ex['eissn'] + "; pISSN:" + jdf_ex['pissn']
    )

    # Journals per (subject, publisher); largest publisher first in each subject
    jdf_gr = (
      jdf_ex.groupby(['subj_areas', 'institution'])
      .agg({'title': 'count', 'title_issn': list})
      .rename(columns={'title': 'ntitle'})
      .sort_values(['subj_areas', 'ntitle'], ascending=[True, False])
      .reset_index()
    )

    # result_path already ends with '/', so no extra separator is inserted
    save_file(jdf_gr, f'{result_path}02-{jour_src}_top-publisher', 'Top publishers by subjects')

    # Profiling per subject
    nsrc = len(jdf_ex.index)
    for subj in jdf_ex['subj_areas'].unique():
      subj_rows = jdf_gr[jdf_gr['subj_areas'] == subj]
      pprofile = _profile_subject(subj_rows, subj, jour_src, nsrc)
      rprofiles.extend(_report_rows(pprofile))
      pprofiles.append(pprofile)

    # Profiling per source
    jprofiles.append({
      'src': jour_src,
      'njournals': len(jdf.index),
      'nsubjects': nsrc,  # number of journal-subject rows after the explode
    })

  save_aprofile(jprofiles, report_path + "02-rep_subjs", src="records", desc="02. Top publisher by journal-subject count")
  save_aprofile(pprofiles, report_path + "02-pub_subjs", src="records", desc="02. Top publisher by journal-subject count")
  save_aprofile(rprofiles, report_path + "02-pub_subjs-rep", src="records", desc="02. Top publisher by journal-subject count")
  
# Entry point: build the reports only when executed as a script, not on import
if __name__ == "__main__":
  main()
